{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Algoritmos de Classificação e Regressão"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### - SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Erro = 0.38198757764\n"
     ]
    }
   ],
   "source": [
    "from pyspark.mllib.classification import SVMWithSGD, SVMModel\n",
    "from pyspark.mllib.regression import LabeledPoint\n",
    "\n",
    "# Constroi estrutura LabeledPoint a partir dos dados\n",
    "def parsePoint(line):\n",
    "    values = [float(x) for x in line.split(' ')]\n",
    "    return LabeledPoint(values[0], values[1:])\n",
    "\n",
    "data = sc.textFile(\"data/sample_svm_data.txt\")\n",
    "parsedData = data.map(parsePoint)\n",
    "\n",
    "# Constroi o modelo SVM utilizando Gradiente Descendente Estocástico\n",
    "model = SVMWithSGD.train(parsedData, iterations=100)\n",
    "\n",
    "# Faz as predições\n",
    "labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))\n",
    "trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())\n",
    "print(\"Erro = \" + str(trainErr))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### - Decision Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Porcentagem de Erro = 0.0769230769231\n",
      "Modelos de classificação gerados:\n",
      "DecisionTreeModel classifier of depth 1 with 3 nodes\n",
      "  If (feature 406 <= 20.0)\n",
      "   Predict: 0.0\n",
      "  Else (feature 406 > 20.0)\n",
      "   Predict: 1.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.mllib.tree import DecisionTree, DecisionTreeModel\n",
    "from pyspark.mllib.util import MLUtils\n",
    "\n",
    "# Carrega os dados\n",
    "data = MLUtils.loadLibSVMFile(sc, 'data/sample_libsvm_data.txt')\n",
    "# Divide os dados em treino (70%) e teste (30%)\n",
    "(trainingData, testData) = data.randomSplit([0.7, 0.3])\n",
    "\n",
    "# Treina um classificador DecisionTree.\n",
    "# categoricalFeaturesInfo vazio indica que todas as features são valores contínuos\n",
    "# caso houvesse alguma feature categórica, categoricalFeaturesInfo = {0:2}, \n",
    "# indicando que a feature 0 tem 2 categorias\n",
    "model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},\n",
    "                                     impurity='gini', maxDepth=5, maxBins=32)\n",
    "\n",
    "# Faz as predições\n",
    "predictions = model.predict(testData.map(lambda x: x.features))\n",
    "labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)\n",
    "testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())\n",
    "print('Porcentagem de Erro = ' + str(testErr))\n",
    "print('Modelos de classificação gerados:')\n",
    "print(model.toDebugString())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### - Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Error = 0.030303030303\n",
      "Learned classification forest model:\n",
      "TreeEnsembleModel classifier with 3 trees\n",
      "\n",
      "  Tree 0:\n",
      "    If (feature 511 <= 0.0)\n",
      "     If (feature 402 <= 0.0)\n",
      "      Predict: 1.0\n",
      "     Else (feature 402 > 0.0)\n",
      "      Predict: 0.0\n",
      "    Else (feature 511 > 0.0)\n",
      "     Predict: 0.0\n",
      "  Tree 1:\n",
      "    If (feature 401 <= 0.0)\n",
      "     If (feature 370 <= 0.0)\n",
      "      Predict: 1.0\n",
      "     Else (feature 370 > 0.0)\n",
      "      Predict: 0.0\n",
      "    Else (feature 401 > 0.0)\n",
      "     Predict: 0.0\n",
      "  Tree 2:\n",
      "    If (feature 399 <= 0.0)\n",
      "     If (feature 494 <= 0.0)\n",
      "      If (feature 434 <= 0.0)\n",
      "       Predict: 0.0\n",
      "      Else (feature 434 > 0.0)\n",
      "       Predict: 1.0\n",
      "     Else (feature 494 > 0.0)\n",
      "      If (feature 606 <= 0.0)\n",
      "       Predict: 0.0\n",
      "      Else (feature 606 > 0.0)\n",
      "       Predict: 1.0\n",
      "    Else (feature 399 > 0.0)\n",
      "     Predict: 0.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.mllib.tree import RandomForest, RandomForestModel\n",
    "from pyspark.mllib.util import MLUtils\n",
    "\n",
    "data = MLUtils.loadLibSVMFile(sc, 'data/sample_libsvm_data.txt')\n",
    "# Divide os dados em treino (70%) e teste (30%)\n",
    "(trainingData, testData) = data.randomSplit([0.7, 0.3])\n",
    "\n",
    "# Treina um classificador DecisionTree.\n",
    "# Número de Decision Tree a serem geradas: 3. \n",
    "model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},\n",
    "                                     numTrees=3, featureSubsetStrategy=\"auto\",\n",
    "                                     impurity='gini', maxDepth=4, maxBins=32)\n",
    "\n",
    "# Faz a predição\n",
    "predictions = model.predict(testData.map(lambda x: x.features))\n",
    "labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)\n",
    "testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())\n",
    "print('Porcentagem de Erro = ' + str(testErr))\n",
    "print('Modelos de classificação gerados:')\n",
    "print(model.toDebugString())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Pyspark (Py 2)",
   "language": "",
   "name": "pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
